POC modeled VS POC observed

Investigate whether modeled POC is representative of POC observations.

Author

Thelma Panaïotis

source("utils.R")

POC observation data

Read data

Read data, keep columns of interest, rename, drop observations where poc is missing, and keep only sediment traps.

df_raw <- read_delim("data/raw/GO_flux.tab", delim = "\t", escape_double = FALSE, trim_ws = TRUE, skip = 87, show_col_types = FALSE)
# Select columns of interest and rename
df <- df_raw %>%
  select(
    id_ref = `ID (Reference identifier)`,
    id_loc = `ID (Unique location identifier)`,
    type = `Type (Data type)`,
    lon = Longitude,
    lat = Latitude,
    depth = `Depth water [m] (Sediment trap deployment depth)`,
    start_date = `Date/Time (Deployed)`,
    end_date = `Date/time end (Retrieved)`,
    duration = `Duration [days]`,
    poc = `POC flux [mg/m**2/day]`
  ) %>%
  # drop observations where poc is missing
  drop_na(poc) %>% 
  # only sediment traps
  filter(type == "sediment_trap")

TODO: file header mentions 198,789 data points but file only has 15792 rows.

Distribution in space and time

Number of observations at each location.

df %>%
  add_count(lon, lat) %>% 
  ggplot() +
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_point(aes(x = lon, y = lat, colour = n)) +
  #scale_colour_cmocean(name = "deep") +
  scale_colour_viridis_c(trans = "log10") +
  coord_quickmap(expand = 0)

Depth distribution

ggplot(df) + geom_histogram(aes(x = depth), binwidth = 100)

Time distribution

ggplot(df) + geom_histogram(aes(x = start_date))

ggplot(df) + geom_histogram(aes(x = duration)) + scale_x_log10()

Data points around the 1000 m horizon

Let’s focus on the data around 1000 m depth. For a set of depth range between 50 and 500 m, compute number and map observations.

hor <- 1000
ranges <- lapply(seq(from = 100, to = 500, by = 50), function(W){
  d <- df %>%
    filter(between(depth, hor - W, hor + W)) %>% # keep points within the depth range
    mutate(W = W)
  return(d)
})
ranges <- do.call(bind_rows, ranges)

ranges %>%
  count(W) %>%
  ggplot() +
  geom_col(aes(x = W, y = n)) +
  ggtitle("Number of data points within 1000 ± W depth range")

ranges %>%
  ggplot() +
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_point(aes(x = lon, y = lat, colour = depth)) +
  scale_colour_cmocean(name = "deep") +
  coord_quickmap(expand = 0) +
  facet_wrap(~W)

Fixed depth range

Let’s use a depth range of 200 meters. Look at the distribution of POC values (transformed and logged).

W <-  200
df <- df %>% 
  filter(between(depth, hor - W, hor + W)) %>% 
  mutate(poc_log = log(poc))
ggplot(df) + geom_histogram(aes(x = poc))

ggplot(df) + geom_histogram(aes(x = poc_log)) # yay

Log-transformed poc is closer to a normal distribution, but if we want to predict modeled poc from observed poc, depending on the prediction model, the distribution of the predictor may not be important.

Let’s now round coordinates to match with Wang et al., 2023 which uses a grid of 2°×2°.

# average by pixel
df_pix <- df %>%
  mutate(
    # floor longitude and add 1 because carbon longitudes are odd
    lon = roundp(lon, precision = 2, f = floor) + 1,
    # round latitude because carbon latitudes are even
    lat = roundp(lat, precision = 2, f = round)
  ) %>%
  group_by(lon, lat) %>%
  summarise(
    poc_mean = mean(poc),
    poc_sd = sd(poc)
  ) %>%
  ungroup()

df_pix %>%
  ggplot() +
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_point(aes(x = lon, y = lat, colour = poc_mean)) +
  scale_colour_cmocean(name = "matter") +
  coord_quickmap(expand = 0)

df_pix %>%
  ggplot() +
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_point(aes(x = lon, y = lat, colour = poc_sd)) +
  scale_colour_viridis_c() +
  coord_quickmap(expand = 0)

Let’s check the number of observations per pixel.

df %>%
  mutate(
    # floor longitude and add 1 because carbon longitudes are odd
    lon = roundp(lon, precision = 2, f = floor) + 1,
    # round latitude because carbon latitudes are even
    lat = roundp(lat, precision = 2, f = round)
  ) %>% 
  count(lon, lat) %>% 
  ggplot() +
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_point(aes(x = lon, y = lat, colour = n)) +
  scale_colour_viridis_c(trans = "log10") +
  coord_quickmap(expand = 0)

Looks like we have 2 timeseries with > 300 observations.

What do we do with these timeseries?

Match with POC modeled data

Read modeled data

load("data/00.carbon_data.Rdata")
df_c <- df_c %>% rename(poc_mod = poc)

ggmap(df_c, var = "poc_mod", type = "raster") +
  scale_fill_cmocean(name = "matter", na.value = NA) +
  geom_point(data = df_pix, aes(x = lon, y = lat), size = 0.5)

Join

Join by coordinates, drop observations where modeled poc is not available.

df_pix <- df_pix %>% 
  left_join(df_c, by = join_by(lon, lat)) %>% 
  drop_na(poc_mod)

Let’s plot a map of our final dataset.

ggplot(df_pix) + 
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_point(aes(x = lon, y = lat)) +
  coord_quickmap(expand = 0)

Let’s also have a look at the data distribution.

ggplot(df_pix) + geom_histogram(aes(x = poc_mean))

ggplot(df_pix) + geom_histogram(aes(x = poc_mod))

Modeled poc values seem much higher than observed data.

Let’s still plot modeled VS observations, both in untransformed and log-transformed spaces.

ggplot(df_pix) +
  geom_point(aes(x = poc_mean, y = poc_mod)) +
  geom_abline(intercept = 0, slope = 1, colour = "red")

ggplot(df_pix) +
  geom_point(aes(x = poc_mean, y = poc_mod)) +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  scale_x_log10() + 
  scale_y_log10()

Correlation is pretty bad. Will be hard to predict modeled value from observation values. Moreover, modeled poc values at 1000 m are very high compared to those at 100 m presented in Wang et al., 2023.

Important

Let’s try dividing modeled values by 12.

ggplot(df_pix) +
  geom_point(aes(x = poc_mean, y = poc_mod/12)) +
  geom_abline(intercept = 0, slope = 1, colour = "red")

ggplot(df_pix) +
  geom_point(aes(x = poc_mean, y = poc_mod/12)) +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  scale_x_log10() + 
  scale_y_log10()

Comparison between maps and data from Wang et al.

https://zenodo.org/records/8253973

POC at euphotic depth

Contour plot of non-advective-diffusive vertical flux (mg C m⁻² day⁻¹) exiting the base of the euphotic zone. The results of a are based on the CbPM NPP product and and e-folding remineralization time of 12h for labile DOC.
## Read data as matrix 
d_mat <- readMat("data/raw/Cexp_CbPM_kl12h.mat")$EXP[,,1]$POCexp
#d_mat <- readMat("data/raw/Cexp_CAFE_kl12h.mat")$EXP[,,1]$POCexp

# Generate colnames as longitudes and rownames as latitudes
colnames(d_mat) <- (c(0.5:179.5) * 2)
rownames(d_mat) <- (c(0:90) * 2) - 90

#d_mat[d_mat == 0] <- NA
#median(d_mat, na.rm = TRUE)

## Convert to dataframe
df_c <- d_mat %>%
  as.data.frame() %>%
  rownames_to_column(var = "lat") %>%
  as_tibble() %>%
  pivot_longer(cols = -lat, names_to = "lon", values_to = "poc") %>%
  mutate(lat = as.numeric(lat), lon = as.numeric(lon)) %>%
  mutate(lon = ifelse(lon > 180, lon - 360, lon)) %>%
  mutate(poc = ifelse(poc == 0, NA, poc)) %>% # TODO cleaner way to remove lands
  mutate(poc = poc/12) %>% 
  arrange(lon, lat) 
summary(df_c)
      lat           lon              poc          
 Min.   :-90   Min.   :-179.0   Min.   :  -7.277  
 1st Qu.:-46   1st Qu.: -89.5   1st Qu.: 120.706  
 Median :  0   Median :   0.0   Median : 199.131  
 Mean   :  0   Mean   :   0.0   Mean   : 195.880  
 3rd Qu.: 46   3rd Qu.:  89.5   3rd Qu.: 249.788  
 Max.   : 90   Max.   : 179.0   Max.   :1093.429  
                                NA's   :6010      
#ggmap(df_c, var = "poc", type = "raster") + labs(fill = "POCexp")

df_c %>% 
  ggplot() + 
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = poc)) +
  scale_fill_distiller(palette = "Greens", na.value = NA, direction = 1) +
  coord_quickmap(expand = 0) +
  labs(title = "POCexp / 12")

df_c %>% 
  ggplot() + 
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = poc)) +
  scale_fill_fermenter(palette = "Greens", na.value = NA, breaks = seq(from = 25, to = 200, by = 25), direction = 1) +
  coord_quickmap(expand = 0) +
  labs(title = "POCexp / 12")

TOC at euphotic depth / 110 m

Distribution of TOC (non-advective-diffusive + advective-diffusive) flux (mg C m−2 day−1) at the base of the euphotic zone
## Read data as matrix 
d_mat <- readMat("data/raw/Cexp_CbPM_kl12h.mat")$EXP[,,1]$TOC110
#d_mat <- readMat("data/raw/Cexp_CAFE_kl12h.mat")$EXP[,,1]$POCexp

# Generate colnames as longitudes and rownames as latitudes
colnames(d_mat) <- (c(0.5:179.5) * 2)
rownames(d_mat) <- (c(0:90) * 2) - 90

#d_mat[d_mat == 0] <- NA
#median(d_mat, na.rm = TRUE)

## Convert to dataframe
df_c <- d_mat %>%
  as.data.frame() %>%
  rownames_to_column(var = "lat") %>%
  as_tibble() %>%
  pivot_longer(cols = -lat, names_to = "lon", values_to = "toc") %>%
  mutate(lat = as.numeric(lat), lon = as.numeric(lon)) %>%
  mutate(lon = ifelse(lon > 180, lon - 360, lon)) %>%
  mutate(toc = ifelse(toc == 0, NA, toc)) %>% # TODO cleaner way to remove lands
  mutate(toc = toc/12) %>% 
  arrange(lon, lat) 
summary(df_c)
      lat           lon              toc         
 Min.   :-90   Min.   :-179.0   Min.   :  0.954  
 1st Qu.:-46   1st Qu.: -89.5   1st Qu.:117.814  
 Median :  0   Median :   0.0   Median :182.922  
 Mean   :  0   Mean   :   0.0   Mean   :179.280  
 3rd Qu.: 46   3rd Qu.:  89.5   3rd Qu.:230.017  
 Max.   : 90   Max.   : 179.0   Max.   :956.461  
                                NA's   :5939     
#ggmap(df_c, var = "toc", type = "raster") + labs(fill = "POCexp")

df_c %>% 
  ggplot() + 
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = toc)) +
  scale_fill_distiller(palette = "Greens", na.value = NA, direction = 1) +
  coord_quickmap(expand = 0) +
  labs(title = "TOC110 / 12")

df_c %>% 
  ggplot() + 
  geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
  geom_raster(aes(x = lon, y = lat, fill = toc)) +
  scale_fill_fermenter(palette = "Greens", na.value = NA, breaks = seq(from = 25, to = 200, by = 25), direction = 1) +
  coord_quickmap(expand = 0) +
  labs(title = "TOC110 / 12")